Workflow to ID motifs and match to genes

(Full implementation in Snakefile)

  1. Download TF motifs from cisbp
  2. Only keep TF which are abs(logFc) > 1 between GFP/RPE and iPSC (~1200)
  3. Check for motifs in narrow ATAC-seq peaks with fimo
  4. Identify closest 2 genes (under 500k bp) to each motif
  5. Bootstrap steps 2 and 3 250 times each to get background rate
# Load Libraries without printing any warnings or messages
library(tidyverse)
library(ggridges)
library(cowplot)
library(DESeq2)
library(ggrepel)

Motif/TF assessed

# Load in real data
samples_full = list.files('/Volumes/data/projects/nei/hufnagel/iPSC_RPE_ATAC_Seq/fimo_motifs/', pattern = '*dat.gz', full.names = T)
fimo_data <- list()
for (i in samples_full){
  sample <- str_split(str_split(i, '/')[[1]][10], '\\.')[[1]][1]
  input <- read_tsv(i)
  input <- input %>% mutate(sequence_name = as.character(sequence_name)) %>% mutate(sample = sample)
  fimo_data[[sample]] <- input
}
sample_motifs = bind_rows(fimo_data) %>% mutate(sample=as.factor(sample))
#sample_motifs %>% group_by(sample, motif_alt_id) %>% summarise(Count=n()) %>% arrange(-Count)
tf_motif <- sample_motifs %>% select('TF' = `# motif_id`, motif_alt_id) %>% unique()
tf_motif %>% DT::datatable()
# merge sample counts with bootstrap counts
all <- bind_rows(sample_motifs %>% group_by(sample, motif_alt_id) %>% summarise(Count=n()) %>% ungroup() %>% mutate(bootstrap = 'real'),
                 bootstrap_counts)
# motif ggridges plotter
plotter <- function(motif, scale=TRUE) {
  if (scale == TRUE){
    all <- all %>% filter(motif_alt_id == !!motif) %>% 
      group_by(sample, motif_alt_id) %>% 
      mutate(`Z score` = scale(Count)) %>% 
      ungroup() 
    all %>% 
      mutate(sample = factor(sample, levels=c('iPSC_IIi_9','iPSC_IIJ_10','iPSC_IIK_11','iPSC_IIL_12','RFP_IIE_1','RFP_IIF_2','RFP_IIG_3','RFP_IIH_4','GFP_IIE_5','GFP_IIF_6','GFP_IIG_7','GFP_IIH_8'))) %>% 
      ggplot(aes(x=`Z score`, y=sample)) + 
      geom_density_ridges() +
      geom_point(data = all %>% 
                   filter(bootstrap == 'real', motif_alt_id == !!motif) %>% 
                   ungroup(), aes(x=`Z score`,y=sample), colour='blue', size=2, alpha=0.5) +
      scale_y_discrete(expand = c(0.01, 0)) +
      theme_ridges() + 
      ggtitle(paste0(motif, ' (', 
                     tf_motif %>% filter(motif_alt_id == !!motif) %>% pull(TF),
                     ')'))
  }
  else {
    all %>% filter(motif_alt_id == !!motif) %>% 
      mutate(sample = factor(sample, levels=c('iPSC_IIi_9','iPSC_IIJ_10','iPSC_IIK_11','iPSC_IIL_12','RFP_IIE_1','RFP_IIF_2','RFP_IIG_3','RFP_IIH_4','GFP_IIE_5','GFP_IIF_6','GFP_IIG_7','GFP_IIH_8'))) %>% 
      ggplot(aes(x=Count, y=sample)) + 
      geom_density_ridges() +
      geom_point(data = sample_motifs %>% 
                   filter(motif_alt_id == !!motif) %>% 
                   group_by(sample, motif_alt_id) %>%
                   summarise(Count=n()) %>% ungroup(), aes(x=Count,y=sample), colour='blue', size=2, alpha=0.5) +
      scale_y_discrete(expand = c(0.01, 0)) +
      theme_ridges() + 
      ggtitle(paste0(motif, ' (', 
                     tf_motif %>% filter(motif_alt_id == !!motif) %>% pull(TF),
                     ')'))
  }
}

Plot enrichment specific motif/TFBS

TET1

plotter('M0610_1.02')

DNMT1

plotter('M0609_1.02')

PAX6

plotter('M6410_1.02')

SOX10

plotter('M6470_1.02')

MITF

plotter('M6345_1.02')

HES1

plotter('M6271_1.02')

SIX3

plotter('M1016_1.02')

SIX2

plotter('M0952_1.02')

Distribution of TFBS/motif enrichment

Fold Change

delta_motif_FC <- all %>% mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
                                                    grepl('RFP', sample) ~ 'RFP',
                                                    TRUE ~ 'iPSC'),
                                   Boot = case_when(bootstrap == 'real' ~ 'real',
                                                    TRUE ~ 'boot')) %>% 
  group_by(Type, motif_alt_id, Boot) %>% 
  mutate(`Z score` = scale(Count)) %>% 
  summarise(Mean=mean(Count)) %>% 
  spread(Boot, Mean) %>%
  ungroup() %>% 
  mutate(FC = real / boot, Type = factor(Type, levels=c('iPSC','RFP','GFP'))) %>% 
  arrange(-FC) %>% 
  left_join(tf_motif) %>% 
  select(-boot, -real) 
Joining, by = "motif_alt_id"
delta_motif_FC %>% ggplot(aes(y=Type, x=log2(FC))) + geom_density_ridges() + theme_ridges() 

delta_motif_FC %>% ggplot(aes(y=Type, x=FC)) + geom_density_ridges() + theme_ridges() 

# calculate z scores for all
Z_scoring <- all %>% 
  # collapse by Type, motif, and whether bootstrap or real
  group_by(sample, motif_alt_id) %>% 
  mutate(`Z score` = scale(Count)) %>% 
  filter(bootstrap == 'real') %>% 
  select(-bootstrap, -TF) %>% 
  left_join(tf_motif)
Joining, by = "motif_alt_id"
Z_scoring %>% head()

Distribution of relative TFBS/motif enrichment between GFP <-> RFP

FC_motif_FC <- all %>% mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
                                                    grepl('RFP', sample) ~ 'RFP',
                                                    TRUE ~ 'iPSC'),
                                   Boot = case_when(bootstrap == 'real' ~ 'real',
                                                    TRUE ~ 'boot')) %>% 
  # collapse by Type, motif, and whether bootstrap or real
  group_by(Type, motif_alt_id, Boot) %>% 
  summarise(Mean=mean(Count)) %>% 
  # go wide
  spread(Boot, Mean) %>%
  # so you can calculate FC
  mutate(FC = real / boot) %>% 
  arrange(-FC) %>% 
  # add motif name
  left_join(tf_motif) %>% 
  select(-boot, -real) %>%
  # spread again to allow for GFP vs RFP or iPSC comparisons
  spread(Type, FC) %>% 
  filter(GFP > 1.2) %>% 
  mutate(`FC^2 GFP <-> iPSC` = GFP / iPSC, `FC^2 GFP <-> RFP` = GFP / RFP) %>% 
  arrange(-`FC^2 GFP <-> RFP`) 
Joining, by = "motif_alt_id"
plot1 <- FC_motif_FC %>% 
  ggplot(aes(x=`FC^2 GFP <-> RFP`)) + 
  geom_density(fill='gray') + 
  theme_minimal() + 
  geom_vline(xintercept = 1.2, color='blue')
plot1

Distribution of relative TFBS/motif enrichment between GFP <-> iPSC

plot2 <- FC_motif_FC %>% 
  ggplot(aes(x=`FC^2 GFP <-> iPSC`)) + 
  geom_density(fill='gray') + 
  theme_minimal()+ 
  geom_vline(xintercept = 1.2, color = 'blue')
plot2 

Both together

plot_grid(plot1 + coord_cartesian(xlim=c(0.5,5.5)), plot2 + coord_cartesian(xlim=c(0.5,5.5)), nrow = 2, align = 'hv', axis='tblr')

DESeq2-based analysis

I realized that motifs by samples is similar to genes by samples

We are dropping RFP II E because from the plots, seems rarely enriched in anything

cts <- all %>% filter(bootstrap=='real') %>% spread(sample, Count) %>% select(-bootstrap, -TF) %>% data.frame()
row.names(cts) <- cts$motif_alt_id
cts %>% head()
cts <- cts[,-1]
coldata <- all %>% select(sample) %>% unique() %>% filter(sample!='RFP_IIE_1') %>% mutate(Type =  case_when(grepl('GFP', sample) ~ 'GFP',
                                                                            grepl('RFP', sample) ~ 'RFP',
                                                                            TRUE ~ 'iPSC'))
dds <- DESeqDataSetFromMatrix(countData = cts %>% select(one_of(as.character(coldata$sample)) ),
                              colData = coldata,
                              design = ~ Type)
some variables in design formula are characters, converting to factors
dds$Type <- relevel(dds$Type, ref='RFP')
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.
final dispersion estimates
fitting model and testing
res <- results(DESeq(dds))
using pre-existing size factors
estimating dispersions
found already estimated dispersions, replacing these
gene-wise dispersion estimates
mean-dispersion relationship
-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.
final dispersion estimates
fitting model and testing
resLFC <- lfcShrink(dds, coef='Type_GFP_vs_RFP', type='apeglm')
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    bioRxiv. https://doi.org/10.1101/303255
plotMA(resLFC)

plot(hist(resLFC$pvalue))

Table of results

The Z score is the number of standard deviation of motifs found in the sample peaks above the random set of peaks

results <- resLFC %>% 
  data.frame() %>% 
  rownames_to_column('motif_alt_id') %>% 
  left_join(tf_motif) %>% 
  arrange(pvalue) %>% 
  data.frame() %>% 
  left_join(Z_scoring %>%  mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
                          grepl('RFP', sample) ~ 'RFP',
                          TRUE ~ 'iPSC')) %>% 
              filter(Type=='GFP') %>% group_by(motif_alt_id) %>% summarise(`Z score` = mean(`Z score`))) %>% 
  left_join(gfp_rfp %>% mutate(TF=Gene, log2FC_GFP_RFP_RNASeq = log2FoldChange) %>% select(TF, log2FC_GFP_RFP_RNASeq)) 
Joining, by = "motif_alt_id"
Joining, by = "motif_alt_id"
Joining, by = "TF"
results <- resLFC %>% 
  data.frame() %>% 
  rownames_to_column('motif_alt_id') %>% 
  left_join(tf_motif) %>% 
  arrange(pvalue) %>% 
  data.frame() %>% 
  left_join(Z_scoring %>%  mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
                          grepl('RFP', sample) ~ 'RFP',
                          TRUE ~ 'iPSC')) %>% 
              filter(Type=='GFP') %>% group_by(motif_alt_id) %>% summarise(`Z score` = mean(`Z score`))) %>% 
  left_join(gfp_rfp %>% mutate(TF=Gene, log2FC_GFP_RFP_RNASeq = log2FoldChange) %>% select(TF, log2FC_GFP_RFP_RNASeq)) 
Joining, by = "motif_alt_id"
Joining, by = "motif_alt_id"
Joining, by = "TF"
results %>% filter((`Z score`) > 2.5) %>% arrange(pvalue) %>% DT::datatable()

Volcano

volcano_maker <- function(df, title){
  df$Class <- 'Not significant'
  df$Class[df$padj < 0.01 & df$log2FoldChange > 0.4 & (df$`Z score`) > 2.5] <- "FDR < 0.01 &\nabs(Z score) > 2.5"
  df$Class[df$padj < 0.01 & df$log2FoldChange < -0.4 & (df$`Z score`) > 2.5] <- "FDR < 0.01 &\nabs(Z score) > 2.5"
  df$Class <- factor(df$Class, levels=c('Not significant', "FDR < 0.01 &\nabs(Z score) > 2.5"))
  plot <- ggplot(data=df,aes(x=log2FoldChange,y=-log10(pvalue))) + 
    geom_point(aes(colour=Class, size=`Z score`), alpha=0.5) +
    scale_colour_manual(values=c("gray","darkred")) + 
    geom_text_repel(data=df %>% filter((padj < 0.01 & log2FoldChange > 0.4 & `Z score` > 2.5) | (padj < 0.01 & log2FoldChange < -0.4 & `Z score` > 2.5)), 
                    aes(label=TF)) +
    # geom_vline(aes(xintercept=-0.5),linetype="dotted") +
    # geom_vline(aes(xintercept=0.5),linetype="dotted") +
    scale_x_continuous(breaks=c(-1,-0.5,0,0.5,1)) +
    ggtitle(title) + theme_minimal()
  return(plot)
}
volcano_maker(results, 'GFP vs RFP TFBS motif counts')

Plots of the top motifs different between GFP and RFP

RFP II E is plotted here, but not used in the differential analysis above

plots <- list()
for (i in results %>% filter((`Z score`) > 2) %>% arrange(pvalue) %>% head(n=12) %>% pull(motif_alt_id)){
  plots[[i]] <- plotter(i, scale=T)
}
plot_grid(plotlist = plots, ncol=3)
Picking joint bandwidth of 0.247
Picking joint bandwidth of 0.26
Picking joint bandwidth of 0.26
Picking joint bandwidth of 0.263
Picking joint bandwidth of 0.252
Picking joint bandwidth of 0.252
Picking joint bandwidth of 0.252
Picking joint bandwidth of 0.268
Picking joint bandwidth of 0.274
Picking joint bandwidth of 0.268
Picking joint bandwidth of 0.258
Picking joint bandwidth of 0.28

# Gene <-> motif matching
# load gtf to map transcript to gene_name

gtf <- read_tsv("/Volumes/data/genomes/1000G_phase2_GRCh37/gencode.v28lift37.metadata.HGNC.gz", col_names = c('Transcript','Gene'))
# gtf <- gtf %>% rowwise() %>% mutate(transcript = ((str_split(V9, ';')[[1]] %>% str_extract('transcript_id.*') %>% na.omit())[1] %>% str_split(., '\"'))[[1]][2],
#                              gene = ((str_split(V9, ';')[[1]] %>% str_extract('gene_name.*') %>% na.omit())[1] %>% str_split(., '\"'))[[1]][2])
## Sample data
### Closest TSS for OTX2 (M5700_1.02)

# samples_full = list.files('/Volumes/data/projects/nei/hufnagel/iPSC_RPE_ATAC_Seq/closest_TSS_motifs/', 
#                           pattern = '*dat.gz', full.names = T)
# 
# closestTSS_data <- list()
# for (i in samples_full){
#   sample <- str_split(str_split(i, '/')[[1]][10], '\\.')[[1]][1]
#  # input <- read_tsv(i, col_names = c('sequence_name', 'start', 'end', 'motif', 'fimo_pvalue', 'strand', 'tss_seq','tss_start','tss_end', 'transcript',  'blank','tss_strand','coord1','coord2','blank2','exon_num','size','exon_pos','distance'))
#   input <- fread(paste('gzcat', i, '| grep M5700_1'))
#   colnames(input) <- c('sequence_name', 'start', 'end', 'motif', 'fimo_pvalue', 'strand', 'tss_seq','tss_start','tss_end', 'transcript',  'blank','tss_strand','coord1','coord2','blank2','exon_num','size','exon_pos','distance')
#   input <- input %>% mutate(sequence_name = as.character(sequence_name),
#                             tss_seq = as.character(tss_seq),
#                             coord1=as.numeric(coord1),
#                             coord2=as.numeric(coord2),
#                             blank2 = as.character(blank2),
#                             exon_num=as.integer(exon_num),
#                             size=as.numeric(size)) %>% mutate(sample = sample)
#   closestTSS_data[[sample]] <- input
# }
# sample_closestTSS = bind_rows(closestTSS_data) %>% mutate(sample=as.factor(sample)) %>% rowwise() %>% mutate(Transcript = str_split(transcript, '_')[[1]][1])
# 
# both <- left_join(sample_closestTSS, gtf) %>% 
#   filter(!is.na(Gene)) %>% 
#   mutate(motif_loc = paste(sequence_name, start, end, sep='_'))
# 
# both %>% 
#   # remove any TSS over 500kb away
#   filter(distance < 500000) %>% 
#   #  one gene per motif
#   group_by(motif_loc, sample, Gene) %>% 
#   top_n(1, distance) %>% 
#   ungroup() %>% 
#   # add up to two genes (total) per motif
#   group_by(motif_loc, sample) %>% 
#   top_n(2, distance) %>% 
#   ungroup() %>% 
#   # arrange by genes most linked to motif  
#   group_by(Gene, sample) %>% 
#   summarise(Count=n(), paste(motif_loc, collapse=', ')) %>% 
#   arrange(-Count)

# What genes have more PAX6 'associated' motifs compared from GFP to RFP

# enriched_genes <- both %>% 
#   # only keep one gene per motif
#   group_by(motif_loc, sample, Gene) %>% 
#   top_n(1, distance) %>% 
#   ungroup() %>% 
#   # keep up to two genes per motif
#   group_by(motif_loc, sample) %>% 
#   top_n(2, distance) %>% 
#   ungroup() %>% 
#   # arrange by genes most linked to motif  
#   group_by(Gene, sample) %>% 
#   summarise(Count=n(), paste(motif_loc, collapse=', ')) %>% 
#   ungroup() %>% 
#   # collapse to GFP/RFP/iPSC
#   mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
#                           grepl('RFP', sample) ~ 'RFP',
#                           TRUE ~ 'iPSC')) %>% 
#   group_by(Gene, Type) %>% 
#   summarise(Total=sum(Count)) %>% 
#   arrange(-Total) %>% 
#   spread(Gene, Total) %>% t() 
# 
# colnames(enriched_genes) <- enriched_genes[1,]
# enriched_genes <- enriched_genes[-1,] %>% data.frame() %>% rownames_to_column('Gene') %>%  mutate(GFP = as.numeric(GFP), iPSC = as.numeric(iPSC), RFP = as.numeric((RFP)))
# 
# enriched_genes[is.na(enriched_genes)] <- 0
# 
# enriched_genes %>% mutate(`deltaGFP <-> RFP` = GFP - RFP) %>% arrange(-`deltaGFP <-> RFP`, GFP) %>% head(1000) %>% DT::datatable(rownames = F)
## does PAX6 regulate PAX6?
#Yes, yes it does.

#GFP specific!
  
# both %>% 
#   # only keep one gene per motif
#   group_by(motif_loc, sample, Gene) %>% 
#   top_n(1, distance) %>% 
#   ungroup() %>% 
#   # keep up to two genes per motif
#   group_by(motif_loc, sample) %>% 
#   top_n(2, distance) %>% 
#   ungroup() %>% 
#   # arrange by genes most linked to motif  
#   group_by(Gene, sample) %>% 
#   summarise(Count=n(), `Motif Locations` = paste(motif_loc, collapse=', ')) %>% 
#   arrange(-Count) %>% 
#   filter(Gene=='PAX6')

```

---
title: Motif / TFBS Analysis
author: David McGaughey
date: '`r format(Sys.Date(), "%Y-%m-%d")`'
output: 
  html_notebook:
    theme: flatly
    toc: true
    code_folding: hide
---

# Workflow to ID motifs and match to genes

(Full implementation in Snakefile)

1. Download TF motifs from cisbp
2. Only keep TF which are abs(logFc) > 1 between GFP/RPE and iPSC (~1200)
3. Check for motifs in narrow ATAC-seq peaks with fimo
4. Identify closest 2 genes (under 500k bp) to each motif
5. Bootstrap steps 2 and 3 250 times each to get background rate

```{r, message=F, warning=F, results='hide'}
# Load Libraries without printing any warnings or messages
library(tidyverse)
library(ggridges)
library(cowplot)
library(DESeq2)
library(ggrepel)
```

# Motif/TF assessed
```{r, results = 'hide'}
# Load in real data
samples_full = list.files('/Volumes/data/projects/nei/hufnagel/iPSC_RPE_ATAC_Seq/fimo_motifs/', pattern = '*dat.gz', full.names = T)

fimo_data <- list()
for (i in samples_full){
  sample <- str_split(str_split(i, '/')[[1]][10], '\\.')[[1]][1]
  input <- read_tsv(i)
  input <- input %>% mutate(sequence_name = as.character(sequence_name)) %>% mutate(sample = sample)
  fimo_data[[sample]] <- input
}
sample_motifs = bind_rows(fimo_data) %>% mutate(sample=as.factor(sample))

#sample_motifs %>% group_by(sample, motif_alt_id) %>% summarise(Count=n()) %>% arrange(-Count)

tf_motif <- sample_motifs %>% select('TF' = `# motif_id`, motif_alt_id) %>% unique()
tf_motif %>% DT::datatable()
```


```{r, warning=F, message=F, echo=F, results='hide'}
# Load in bootstraps
bootstrap_stats = list.files('/Volumes/data/projects/nei/hufnagel/iPSC_RPE_ATAC_Seq/fimo_motifs/bootstrapping_stats/', pattern = '*dat.gz', full.names = T)
#samples_full = list.files('~/Desktop/fimo_motifs/bootstrapping/', pattern = '*dat.gz', full.names = T)

# big difference here vs above
# not saving full file, as each bootstrap is ~200mb COMPRESSED
# just saving the stats
# right now, counts by motif/sample
fimo_data <- list()
for (i in bootstrap_stats){
  sample <- str_split(str_split(i, '/')[[1]][11], '\\.')[[1]][1]
  bootstrap_num <- str_split(str_split(str_split(i, '/')[[1]][11], '\\.')[[1]][2], '_')[[1]][2]
  sample_boot <- paste0(sample,'_',bootstrap_num)
  stats <- read_tsv(i) %>% mutate(sample = as.factor(sample), bootstrap = bootstrap_num)
  fimo_data[[sample_boot]] <- stats
}
1
bootstrap_counts = bind_rows(fimo_data) %>% mutate(sample=as.factor(sample)) %>% left_join(tf_motif)
```

```{r}
# merge sample counts with bootstrap counts
all <- bind_rows(sample_motifs %>% group_by(sample, motif_alt_id) %>% summarise(Count=n()) %>% ungroup() %>% mutate(bootstrap = 'real'),
                 bootstrap_counts)

# motif ggridges plotter

plotter <- function(motif, scale=TRUE) {
  if (scale == TRUE){
    all <- all %>% filter(motif_alt_id == !!motif) %>% 
      group_by(sample, motif_alt_id) %>% 
      mutate(`Z score` = scale(Count)) %>% 
      ungroup() 
    all %>% 
      mutate(sample = factor(sample, levels=c('iPSC_IIi_9','iPSC_IIJ_10','iPSC_IIK_11','iPSC_IIL_12','RFP_IIE_1','RFP_IIF_2','RFP_IIG_3','RFP_IIH_4','GFP_IIE_5','GFP_IIF_6','GFP_IIG_7','GFP_IIH_8'))) %>% 
      ggplot(aes(x=`Z score`, y=sample)) + 
      geom_density_ridges() +
      geom_point(data = all %>% 
                   filter(bootstrap == 'real', motif_alt_id == !!motif) %>% 
                   ungroup(), aes(x=`Z score`,y=sample), colour='blue', size=2, alpha=0.5) +
      scale_y_discrete(expand = c(0.01, 0)) +
      theme_ridges() + 
      ggtitle(paste0(motif, ' (', 
                     tf_motif %>% filter(motif_alt_id == !!motif) %>% pull(TF),
                     ')'))
  }
  else {
    all %>% filter(motif_alt_id == !!motif) %>% 
      mutate(sample = factor(sample, levels=c('iPSC_IIi_9','iPSC_IIJ_10','iPSC_IIK_11','iPSC_IIL_12','RFP_IIE_1','RFP_IIF_2','RFP_IIG_3','RFP_IIH_4','GFP_IIE_5','GFP_IIF_6','GFP_IIG_7','GFP_IIH_8'))) %>% 
      ggplot(aes(x=Count, y=sample)) + 
      geom_density_ridges() +
      geom_point(data = sample_motifs %>% 
                   filter(motif_alt_id == !!motif) %>% 
                   group_by(sample, motif_alt_id) %>%
                   summarise(Count=n()) %>% ungroup(), aes(x=Count,y=sample), colour='blue', size=2, alpha=0.5) +
      scale_y_discrete(expand = c(0.01, 0)) +
      theme_ridges() + 
      ggtitle(paste0(motif, ' (', 
                     tf_motif %>% filter(motif_alt_id == !!motif) %>% pull(TF),
                     ')'))
  }
}
```
# Plot enrichment  specific motif/TFBS

## TET1
```{r}
plotter('M0610_1.02')
```

## DNMT1
```{r}
plotter('M0609_1.02')
```

## PAX6
```{r}
plotter('M6410_1.02')
```

## SOX10
```{r}
plotter('M6470_1.02')
```

## MITF
```{r}
plotter('M6345_1.02')
```

## HES1
```{r}
plotter('M6271_1.02')
```

## SIX3
```{r}
plotter('M1016_1.02')
```

## SIX2
```{r}
plotter('M0952_1.02')
```

# Distribution of TFBS/motif enrichment
## Fold Change
```{r}
delta_motif_FC <- all %>% mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
                                                    grepl('RFP', sample) ~ 'RFP',
                                                    TRUE ~ 'iPSC'),
                                   Boot = case_when(bootstrap == 'real' ~ 'real',
                                                    TRUE ~ 'boot')) %>% 
  group_by(Type, motif_alt_id, Boot) %>% 
  mutate(`Z score` = scale(Count)) %>% 
  summarise(Mean=mean(Count)) %>% 
  spread(Boot, Mean) %>%
  ungroup() %>% 
  mutate(FC = real / boot, Type = factor(Type, levels=c('iPSC','RFP','GFP'))) %>% 
  arrange(-FC) %>% 
  left_join(tf_motif) %>% 
  select(-boot, -real) 

delta_motif_FC %>% ggplot(aes(y=Type, x=log2(FC))) + geom_density_ridges() + theme_ridges() 

delta_motif_FC %>% ggplot(aes(y=Type, x=FC)) + geom_density_ridges() + theme_ridges() 
```



```{r}
# calculate z scores for all
Z_scoring <- all %>% 
  # collapse by Type, motif, and whether bootstrap or real
  group_by(sample, motif_alt_id) %>% 
  mutate(`Z score` = scale(Count)) %>% 
  filter(bootstrap == 'real') %>% 
  select(-bootstrap, -TF) %>% 
  left_join(tf_motif)

Z_scoring %>% head()
```
# Distribution of *relative* TFBS/motif enrichment *between* GFP <-> RFP
```{r}
FC_motif_FC <- all %>% mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
                                                    grepl('RFP', sample) ~ 'RFP',
                                                    TRUE ~ 'iPSC'),
                                   Boot = case_when(bootstrap == 'real' ~ 'real',
                                                    TRUE ~ 'boot')) %>% 
  # collapse by Type, motif, and whether bootstrap or real
  group_by(Type, motif_alt_id, Boot) %>% 
  summarise(Mean=mean(Count)) %>% 
  # go wide
  spread(Boot, Mean) %>%
  # so you can calculate FC
  mutate(FC = real / boot) %>% 
  arrange(-FC) %>% 
  # add motif name
  left_join(tf_motif) %>% 
  select(-boot, -real) %>%
  # spread again to allow for GFP vs RFP or iPSC comparisons
  spread(Type, FC) %>% 
  filter(GFP > 1.2) %>% 
  mutate(`FC^2 GFP <-> iPSC` = GFP / iPSC, `FC^2 GFP <-> RFP` = GFP / RFP) %>% 
  arrange(-`FC^2 GFP <-> RFP`) 

plot1 <- FC_motif_FC %>% 
  ggplot(aes(x=`FC^2 GFP <-> RFP`)) + 
  geom_density(fill='gray') + 
  theme_minimal() + 
  geom_vline(xintercept = 1.2, color='blue')
plot1
```

# Distribution of *relative* TFBS/motif enrichment *between* GFP <-> iPSC
```{r}
plot2 <- FC_motif_FC %>% 
  ggplot(aes(x=`FC^2 GFP <-> iPSC`)) + 
  geom_density(fill='gray') + 
  theme_minimal()+ 
  geom_vline(xintercept = 1.2, color = 'blue')
plot2 
```

## Both together
```{r}
plot_grid(plot1 + coord_cartesian(xlim=c(0.5,5.5)), plot2 + coord_cartesian(xlim=c(0.5,5.5)), nrow = 2, align = 'hv', axis='tblr')
```

# DESeq2-based analysis
I realized that motifs by samples is similar to genes by samples

We are dropping RFP II E because from the plots, seems rarely enriched in anything
```{r}
cts <- all %>% filter(bootstrap=='real') %>% spread(sample, Count) %>% select(-bootstrap, -TF) %>% data.frame()
row.names(cts) <- cts$motif_alt_id
cts %>% head()
cts <- cts[,-1]

coldata <- all %>% select(sample) %>% unique() %>% filter(sample!='RFP_IIE_1') %>% mutate(Type =  case_when(grepl('GFP', sample) ~ 'GFP',
                                                                            grepl('RFP', sample) ~ 'RFP',
                                                                            TRUE ~ 'iPSC'))

dds <- DESeqDataSetFromMatrix(countData = cts %>% select(one_of(as.character(coldata$sample)) ),
                              colData = coldata,
                              design = ~ Type)
dds$Type <- relevel(dds$Type, ref='RFP')
dds <- DESeq(dds)

res <- results(DESeq(dds))
resLFC <- lfcShrink(dds, coef='Type_GFP_vs_RFP', type='apeglm')
plotMA(resLFC)
plot(hist(resLFC$pvalue))

```

```{r}
# Load in RNA-seq data
gfp_rfp <- read_csv('~/git/ipsc_rpe_RNA-seq/data/GFP_vs_RFP.results.csv')
rpe_ipsc <- read_csv('~/git/ipsc_rpe_RNA-seq/data/RPE_vs_iPSC.results.csv')
gfp_rfp %>% head()
```

## Table of results
The `Z score` is the number of standard deviation of motifs found in the sample peaks above the random set of peaks
```{r}
results <- resLFC %>% 
  data.frame() %>% 
  rownames_to_column('motif_alt_id') %>% 
  left_join(tf_motif) %>% 
  arrange(pvalue) %>% 
  data.frame() %>% 
  left_join(Z_scoring %>%  mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
                          grepl('RFP', sample) ~ 'RFP',
                          TRUE ~ 'iPSC')) %>% 
              filter(Type=='GFP') %>% group_by(motif_alt_id) %>% summarise(`Z score` = mean(`Z score`))) %>% 
  left_join(gfp_rfp %>% mutate(TF=Gene, log2FC_GFP_RFP_RNASeq = log2FoldChange) %>% select(TF, log2FC_GFP_RFP_RNASeq)) 
results %>% filter((`Z score`) > 2.5) %>% arrange(pvalue) %>% DT::datatable()
```

## Volcano
```{r}
volcano_maker <- function(df, title){
  df$Class <- 'Not significant'
  df$Class[df$padj < 0.01 & df$log2FoldChange > 0.4 & (df$`Z score`) > 2.5] <- "FDR < 0.01 &\nabs(Z score) > 2.5"
  df$Class[df$padj < 0.01 & df$log2FoldChange < -0.4 & (df$`Z score`) > 2.5] <- "FDR < 0.01 &\nabs(Z score) > 2.5"
  df$Class <- factor(df$Class, levels=c('Not significant', "FDR < 0.01 &\nabs(Z score) > 2.5"))
  plot <- ggplot(data=df,aes(x=log2FoldChange,y=-log10(pvalue))) + 
    geom_point(aes(colour=Class, size=`Z score`), alpha=0.5) +
    scale_colour_manual(values=c("gray","darkred")) + 
    geom_text_repel(data=df %>% filter((padj < 0.01 & log2FoldChange > 0.4 & `Z score` > 2.5) | (padj < 0.01 & log2FoldChange < -0.4 & `Z score` > 2.5)), 
                    aes(label=TF)) +
    # geom_vline(aes(xintercept=-0.5),linetype="dotted") +
    # geom_vline(aes(xintercept=0.5),linetype="dotted") +
    scale_x_continuous(breaks=c(-1,-0.5,0,0.5,1)) +
    ggtitle(title) + theme_minimal()
  return(plot)
}
volcano_maker(results, 'GFP vs RFP TFBS motif counts')

```


## Plots of the top motifs different between GFP and RFP
RFP II E is plotted here, but not used in the differential analysis above
```{r, fig.height=5, fig.width=6}
plots <- list()
for (i in results %>% filter((`Z score`) > 2) %>% arrange(pvalue) %>% head(n=12) %>% pull(motif_alt_id)){
  plots[[i]] <- plotter(i, scale=T)
}


plot_grid(plotlist = plots, ncol=3)
```


```{r}
# Gene <-> motif matching
# load gtf to map transcript to gene_name

gtf <- read_tsv("/Volumes/data/genomes/1000G_phase2_GRCh37/gencode.v28lift37.metadata.HGNC.gz", col_names = c('Transcript','Gene'))
# gtf <- gtf %>% rowwise() %>% mutate(transcript = ((str_split(V9, ';')[[1]] %>% str_extract('transcript_id.*') %>% na.omit())[1] %>% str_split(., '\"'))[[1]][2],
#                              gene = ((str_split(V9, ';')[[1]] %>% str_extract('gene_name.*') %>% na.omit())[1] %>% str_split(., '\"'))[[1]][2])
```

```{r}
## Sample data
### Closest TSS for OTX2 (M5700_1.02)

# samples_full = list.files('/Volumes/data/projects/nei/hufnagel/iPSC_RPE_ATAC_Seq/closest_TSS_motifs/', 
#                           pattern = '*dat.gz', full.names = T)
# 
# closestTSS_data <- list()
# for (i in samples_full){
#   sample <- str_split(str_split(i, '/')[[1]][10], '\\.')[[1]][1]
#  # input <- read_tsv(i, col_names = c('sequence_name', 'start', 'end', 'motif', 'fimo_pvalue', 'strand', 'tss_seq','tss_start','tss_end', 'transcript',  'blank','tss_strand','coord1','coord2','blank2','exon_num','size','exon_pos','distance'))
#   input <- fread(paste('gzcat', i, '| grep M5700_1'))
#   colnames(input) <- c('sequence_name', 'start', 'end', 'motif', 'fimo_pvalue', 'strand', 'tss_seq','tss_start','tss_end', 'transcript',  'blank','tss_strand','coord1','coord2','blank2','exon_num','size','exon_pos','distance')
#   input <- input %>% mutate(sequence_name = as.character(sequence_name),
#                             tss_seq = as.character(tss_seq),
#                             coord1=as.numeric(coord1),
#                             coord2=as.numeric(coord2),
#                             blank2 = as.character(blank2),
#                             exon_num=as.integer(exon_num),
#                             size=as.numeric(size)) %>% mutate(sample = sample)
#   closestTSS_data[[sample]] <- input
# }
# sample_closestTSS = bind_rows(closestTSS_data) %>% mutate(sample=as.factor(sample)) %>% rowwise() %>% mutate(Transcript = str_split(transcript, '_')[[1]][1])
# 
# both <- left_join(sample_closestTSS, gtf) %>% 
#   filter(!is.na(Gene)) %>% 
#   mutate(motif_loc = paste(sequence_name, start, end, sep='_'))
# 
# both %>% 
#   # remove any TSS over 500kb away
#   filter(distance < 500000) %>% 
#   #  one gene per motif
#   group_by(motif_loc, sample, Gene) %>% 
#   top_n(1, distance) %>% 
#   ungroup() %>% 
#   # add up to two genes (total) per motif
#   group_by(motif_loc, sample) %>% 
#   top_n(2, distance) %>% 
#   ungroup() %>% 
#   # arrange by genes most linked to motif  
#   group_by(Gene, sample) %>% 
#   summarise(Count=n(), paste(motif_loc, collapse=', ')) %>% 
#   arrange(-Count)
```

```{r}

# What genes have more PAX6 'associated' motifs compared from GFP to RFP

# enriched_genes <- both %>% 
#   # only keep one gene per motif
#   group_by(motif_loc, sample, Gene) %>% 
#   top_n(1, distance) %>% 
#   ungroup() %>% 
#   # keep up to two genes per motif
#   group_by(motif_loc, sample) %>% 
#   top_n(2, distance) %>% 
#   ungroup() %>% 
#   # arrange by genes most linked to motif  
#   group_by(Gene, sample) %>% 
#   summarise(Count=n(), paste(motif_loc, collapse=', ')) %>% 
#   ungroup() %>% 
#   # collapse to GFP/RFP/iPSC
#   mutate(Type = case_when(grepl('GFP', sample) ~ 'GFP',
#                           grepl('RFP', sample) ~ 'RFP',
#                           TRUE ~ 'iPSC')) %>% 
#   group_by(Gene, Type) %>% 
#   summarise(Total=sum(Count)) %>% 
#   arrange(-Total) %>% 
#   spread(Gene, Total) %>% t() 
# 
# colnames(enriched_genes) <- enriched_genes[1,]
# enriched_genes <- enriched_genes[-1,] %>% data.frame() %>% rownames_to_column('Gene') %>%  mutate(GFP = as.numeric(GFP), iPSC = as.numeric(iPSC), RFP = as.numeric((RFP)))
# 
# enriched_genes[is.na(enriched_genes)] <- 0
# 
# enriched_genes %>% mutate(`deltaGFP <-> RFP` = GFP - RFP) %>% arrange(-`deltaGFP <-> RFP`, GFP) %>% head(1000) %>% DT::datatable(rownames = F)

```


```{r}
## does PAX6 regulate PAX6?
#Yes, yes it does.

#GFP specific!
  
# both %>% 
#   # only keep one gene per motif
#   group_by(motif_loc, sample, Gene) %>% 
#   top_n(1, distance) %>% 
#   ungroup() %>% 
#   # keep up to two genes per motif
#   group_by(motif_loc, sample) %>% 
#   top_n(2, distance) %>% 
#   ungroup() %>% 
#   # arrange by genes most linked to motif  
#   group_by(Gene, sample) %>% 
#   summarise(Count=n(), `Motif Locations` = paste(motif_loc, collapse=', ')) %>% 
#   arrange(-Count) %>% 
#   filter(Gene=='PAX6')
```

```
